Importing Libraries and Data Preprocessing
# import libraries
library(readxl)
library(ggplot2)
library(car)
library(reshape2)
library(haven)
library(dplyr)
library(zoo)
library(metafor)
library(tidyr)
library(forestplot)
formatify <- function(df, code) {
df <- df[df$Code == code, c("Year", tail(names(df), 1))]
colnames(df)[ncol(df)] <- "value"
return(df)
}
print_info <- function(df) {
cat(deparse(substitute(df)), "\n")
col1 <- df[,1]
cat("[", min(col1), ", ", max(col1), "]", "\n")
head(df)
}
adjusted_net_savings <- read.csv("data/adjusted-net-saving-current-us-dollars.csv")
adjusted_net_savings <- formatify(adjusted_net_savings, "ISR")
annual_co2_emissions <- read.csv("data/annual-co2-emissions-per-country.csv")
annual_co2_emissions <- formatify(annual_co2_emissions, "ISR")
child_mortality <- read.csv("data/child-mortality-around-the-world.csv")
child_mortality <- formatify(child_mortality, "ISR")
gdp_per_capita <- read.csv("data/gdp-per-capita-worldbank.csv")
gdp_per_capita <- formatify(gdp_per_capita, "ISR")
government_revenue_sharegdp <- read.csv("data/government-revenue-as-a-share-of-gdp.csv")
government_revenue_sharegdp <- formatify(government_revenue_sharegdp, "ISR")
GDP <- read.csv("data/gross-domestic-product.csv")
GDP <- formatify(GDP, "ISR")
human_development_index <- read.csv("data/human-development-index.csv")
human_development_index <- formatify(human_development_index, "ISR")
labor_productivity <- read.csv("data/labor-productivity-per-hour-PennWorldTable.csv")
labor_productivity <- formatify(labor_productivity, "ISR")
life_expectancy <- read.csv("data/life-expectancy.csv")
life_expectancy <- formatify(life_expectancy, "ISR")
research_spending_sharegdp <- read.csv("data/research-spending-gdp.csv")
research_spending_sharegdp <- formatify(research_spending_sharegdp, "ISR")
solar_electricity_percapita <- read.csv("data/solar-electricity-per-capita.csv")
solar_electricity_percapita <- formatify(solar_electricity_percapita, "ISR")
unemployment_rate <- read.csv("data/unemployment-rate.csv")
unemployment_rate <- formatify(unemployment_rate, "ISR")
education_in_govt_exp <- read.csv("data/share-of-education-in-government-expenditure.csv")
education_in_govt_exp <- formatify(education_in_govt_exp, "ISR")
labour_data <- read.csv("romer-model-data/Labor_Data.csv")
labour_data <- subset(labour_data, Country.Code == "ISR")
labour_data <- labour_data[, 5:ncol(labour_data)]
col_names <- names(labour_data)
new_col_names <- substring(col_names, 2)
names(labour_data) <- new_col_names
labour_data <- melt(labour_data)
labour_data <- labour_data[, !colSums(is.na(labour_data)) == nrow(labour_data)]
labour_data <- na.omit(labour_data)
colnames(labour_data) <- c("Year", "value")
head(labour_data)
## Year value
## 1 1990 1900701
## 2 1991 2035534
## 3 1992 2133756
## 4 1993 2252545
## 5 1994 2353183
## 6 1995 2442219
print_info(adjusted_net_savings)
## adjusted_net_savings
## [ 1970 , 2019 ]
## Year value
## 2295 1970 724273088
## 2296 1971 1260111232
## 2297 1972 2017276672
## 2298 1973 2126510976
## 2299 1974 1793309056
## 2300 1975 1352494464
print_info(annual_co2_emissions)
## annual_co2_emissions
## [ 1930 , 2021 ]
## Year value
## 14394 1930 39977
## 14395 1931 39977
## 14396 1932 50880
## 14397 1933 65417
## 14398 1934 69051
## 14399 1935 90857
print_info(child_mortality)
## child_mortality
## [ 1950 , 2021 ]
## Year value
## 7561 1950 4.29607
## 7562 1951 4.31432
## 7563 1952 4.32142
## 7564 1953 4.26406
## 7565 1954 4.20126
## 7566 1955 4.13682
print_info(gdp_per_capita)
## gdp_per_capita
## [ 1995 , 2020 ]
## Year value
## 2604 1995 26233.13
## 2605 1996 27096.08
## 2606 1997 27464.76
## 2607 1998 27970.64
## 2608 1999 28240.99
## 2609 2000 29950.32
print_info(government_revenue_sharegdp)
## government_revenue_sharegdp
## [ 2000 , 2020 ]
## Year value
## 1344 2000 43.76
## 1345 2001 43.54
## 1346 2002 43.25
## 1347 2003 41.84
## 1348 2004 40.91
## 1349 2005 41.00
print_info(GDP)
## GDP
## [ 1995 , 2020 ]
## Year value
## 4599 1995 139622940672
## 4600 1996 148039122944
## 4601 1997 153849528320
## 4602 1998 160307806208
## 4603 1999 166031736832
## 4604 2000 180795719680
print_info(human_development_index)
## human_development_index
## [ 1990 , 2021 ]
## Year value
## 2544 1990 0.787
## 2545 1991 0.793
## 2546 1992 0.797
## 2547 1993 0.804
## 2548 1994 0.808
## 2549 1995 0.811
print_info(labor_productivity)
## labor_productivity
## [ 1981 , 2019 ]
## Year value
## 1577 1981 25.19910
## 1578 1982 25.77843
## 1579 1983 26.18268
## 1580 1984 26.19285
## 1581 1985 26.88302
## 1582 1986 28.37297
print_info(life_expectancy)
## life_expectancy
## [ 1950 , 2021 ]
## Year value
## 8486 1950 68.2
## 8487 1951 68.1
## 8488 1952 68.0
## 8489 1953 68.1
## 8490 1954 68.3
## 8491 1955 68.7
print_info(research_spending_sharegdp)
## research_spending_sharegdp
## [ 1996 , 2018 ]
## Year value
## 932 1996 2.59054
## 933 1997 2.80736
## 934 1998 2.91897
## 935 1999 3.33008
## 936 2000 3.93329
## 937 2001 4.18499
print_info(solar_electricity_percapita)
## solar_electricity_percapita
## [ 1965 , 2021 ]
## Year value
## 3423 1965 0
## 3424 1966 0
## 3425 1967 0
## 3426 1968 0
## 3427 1969 0
## 3428 1970 0
print_info(unemployment_rate)
## unemployment_rate
## [ 1991 , 2021 ]
## Year value
## 2636 1991 13.39
## 2637 1992 14.08
## 2638 1993 12.74
## 2639 1994 9.93
## 2640 1995 8.78
## 2641 1996 8.46
library(dplyr)
library(tidyr)
library(purrr)
# Create a list of dataframes to merge
dfs <- list(adjusted_net_savings, annual_co2_emissions, child_mortality, gdp_per_capita,
government_revenue_sharegdp, GDP, human_development_index, labor_productivity,
life_expectancy, research_spending_sharegdp, solar_electricity_percapita, unemployment_rate,
education_in_govt_exp, labour_data)
# Get the names of the original dataframes
df_names <- c("Year", "adjusted_net_savings", "annual_co2_emissions", "child_mortality", "gdp_per_capita",
"government_revenue_sharegdp", "GDP", "human_development_index", "labor_productivity",
"life_expectancy", "research_spending_sharegdp", "solar_electricity_percapita", "unemployment_rate",
"education_in_govt_exp", "labour_data")
# Merge dataframes using the "Year" column as the key and keep the original dataframe names
merged_df <- Reduce(function(x, y) {
suffix_x <- df_names[which(df_names == deparse(substitute(x)))][1]
suffix_y <- df_names[which(df_names == deparse(substitute(y)))][1]
merge(x, y, by = "Year", all = TRUE, suffixes = c(paste0(".", suffix_x), paste0(".", suffix_y)))
}, dfs)
merged_df <- set_names(merged_df, df_names)
# Remove rows where Year <= 1960
merged_df <- subset(merged_df, Year > 1960)
copy_df <- merged_df
# Fill missing values using mean imputation
for (col in names(copy_df)[-1]) {
copy_df[is.na(copy_df[, col]), col] <- mean(copy_df[, col], na.rm = TRUE)
}
# Compute correlation matrix
cor_matrix <- cor(copy_df, use = "complete.obs")
# Plot correlation matrix
melted_matrix <- melt(cor_matrix)
melted_matrix <- melted_matrix %>%
arrange(desc(value))
ggplot(melted_matrix, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#2166ac", mid = "white", high = "#b2182b",
midpoint = 0, na.value = "gray") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6.5),
axis.text.y = element_text(angle = 0, hjust = 1, vjust = 0.5, size = 6.5),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.text = element_text(size = 12),
legend.title = element_text(size = 14)
) +
labs(title = "Correlation Matrix", x = "", y = "", fill = "Correlation")

# Create a new data frame with only the required columns
lm_df <- merged_df[c("GDP", "Year", "labour_data", "education_in_govt_exp", "labor_productivity")]
lm_df <- na.omit(lm_df)
lm_df$log_education <- log(lm_df$education_in_govt_exp)
lm_df$log_GDP <- log(lm_df$GDP)
lm_df$log_labour <- log(lm_df$labour_data)
lm2 <- lm(log_GDP ~ Year + log_labour + log_education + labor_productivity, data = lm_df)
car::vif(lm2)
## Year log_labour log_education labor_productivity
## 726.535994 730.445030 9.729554 4.087105
# Plot linear regression model
plot(lm_df$log_GDP, fitted(lm2), main = "Linear Regression Model", xlab = "Actual Values", ylab = "Predicted Values")

plot(lm2)




# Compute correlation matrix
cor_matrix <- cor(lm_df, use = "complete.obs")
# Plot correlation matrix
melted_matrix <- melt(cor_matrix)
melted_matrix <- melted_matrix %>%
arrange(desc(value))
ggplot(melted_matrix, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#2166ac", mid = "white", high = "#b2182b",
midpoint = 0, na.value = "gray") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6.5),
axis.text.y = element_text(angle = 0, hjust = 1, vjust = 0.5, size = 6.5),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14),
legend.text = element_text(size = 12),
legend.title = element_text(size = 14)
) +
labs(title = "Correlation Matrix", x = "", y = "", fill = "Correlation")

remove_na_rows <- function(df) {
cleaned_df <- df[complete.cases(df), ]
return(cleaned_df)
}
mydf <- merged_df[c("labor_productivity", "education_in_govt_exp", "Year")]
mydf <- remove_na_rows(mydf)
ggplot(mydf, aes(x = Year, y = labor_productivity, color = education_in_govt_exp)) +
geom_line(size = 1.5) +
scale_color_gradient(low = "purple", high = "yellow") +
labs(x = "Year", y = "Labor productivity", color = "Education spending share of govt. expenditure")

mydf <- merged_df[c("gdp_per_capita", "life_expectancy", "government_revenue_sharegdp")]
mydf <- remove_na_rows(mydf)
ggplot(mydf, aes(x = gdp_per_capita, y = life_expectancy, size = government_revenue_sharegdp, color = government_revenue_sharegdp)) +
geom_point(alpha = 0.7, shape = 21, stroke = 1, show.legend = FALSE) +
scale_size_continuous(range = c(3, 15)) +
scale_color_gradient(low = "blue", high = "red") +
labs(x = "GDP per capita", y = "Life expectancy", size = "Government revenue share of GDP") +
theme_minimal()

mydf <- merged_df[c("gdp_per_capita", "life_expectancy", "annual_co2_emissions")]
mydf <- remove_na_rows(mydf)
ggplot(mydf, aes(x = gdp_per_capita, y = life_expectancy, size = annual_co2_emissions, color = annual_co2_emissions)) +
geom_point(alpha = 0.7, shape = 21, stroke = 1, show.legend = FALSE) +
scale_size_continuous(range = c(3, 15)) +
scale_color_gradient(low = "blue", high = "red") +
labs(x = "GDP per capita", y = "Life expectancy", size = "Annual CO2 Emissions") +
theme_minimal()

# Subset the relevant data
co2_life <- merged_df[c("Year", "annual_co2_emissions", "life_expectancy")]
# Remove any rows with missing values
co2_life <- remove_na_rows(co2_life)
# Create the plot
ggplot(co2_life, aes(x = Year, y = annual_co2_emissions, fill = "CO2 Emissions")) +
geom_area() +
geom_line(aes(y = life_expectancy, color = "Life Expectancy")) +
scale_color_manual(values = "blue") +
scale_fill_manual(values = "red") +
labs(title = "CO2 Emissions and Life Expectancy over Time",
x = "Year",
y = "Value",
color = "",
fill = "") +
theme_minimal()

library(plotly)
mydf <- merged_df[c("research_spending_sharegdp", "education_in_govt_exp", "human_development_index")]
mydf <- remove_na_rows(mydf)
# Create a scatter plot with HDI, research spending, and education spending
fig <- plot_ly(mydf, x = ~research_spending_sharegdp, y = ~education_in_govt_exp, z = ~human_development_index,
type = "scatter3d", mode = "markers",
marker = list(size = 5, color = ~human_development_index, colorscale = "Viridis", sizemode = "diameter",
line = list(width = 1, color = "Black"))
)
# Add axis titles and set the camera position
fig <- fig %>% layout(scene = list(xaxis = list(title = "Research spending (% of GDP)"),
yaxis = list(title = "Education spending (% of total government spending)"),
zaxis = list(title = "HDI")),
margin = list(l = 0, r = 0, b = 0, t = 0),
scene_camera = list(
center = list(x = 0, y = 0, z = 0),
eye = list(x = 1.5, y = 1.5, z = 1.5))
)
fig